### Statistics on baseline viewing for comparison with treatment effect sizes

library(tidyverse)
library(magrittr)
library(data.table)
library(lubridate)
library(hms)
library(ggthemes)
theme_set(theme_few())

### SET WORKING DIRECTORY HERE ###
path_to_archive <- "replication/"
data_dir <- paste0(path_to_archive, "data/")
setwd(data_dir)
plot_dir <- paste0(path_to_archive, "plots/")
tables_dir <- paste0(path_to_archive, "tables/")

get_prev_24h_time <- function(view_beg_time, view_end_time, ad_time) {
	window_start <- ad_time - dhours(24)
	window_end <- ad_time

	(pmin(view_end_time, window_end) - pmax(view_beg_time, window_start)) %>% 
		time_length(unit="minute") %>% 
		pmax(0) %>%
		sum(na.rm=T)
}


get_mins <- function(d){
	# get news minutes for all devices in T group for each ad
	
	cat(as.character(d), "\n")

	cat("\tLoading T/C group definition...\n")
	t_c_by_ad <- readRDS(paste0(data_dir, "dd/tc_groups_", as.character(d), ".rds")) %>%
		mutate(n_t = map_dbl(t_c, ~ .[group=="T", .N]),
			   n_c1 = map_dbl(t_c, ~ .[group=="C1", .N]),
			   n_c2 = map_dbl(t_c, ~ .[group=="C2", .N]),
			   n_c12 = map_dbl(t_c, ~ .[group=="C1+C2", .N])) %>%
		filter(n_t > 0, (n_c1 + n_c2 + n_c12 > 0))
	
	cat("\tReading news viewing intervals...\n")	
	news_view_yday <- readRDS(paste0(data_dir, "news_intervals/news_intervals_", as.character(d-1), ".rds"))
	news_view_tday <- readRDS(paste0(data_dir, "news_intervals/news_intervals_", as.character(d), ".rds"))
	news_view_tmw <- readRDS(paste0(data_dir, "news_intervals/news_intervals_", as.character(d+1), ".rds"))

	news_view <- rbindlist(list(news_view_yday,news_view_tday,news_view_tmw)) %>% 
		.[,.(device_id, event_time_utc, event_time_utc_end)] %>%
		setkey(device_id)


	dev_stack <- t_c_by_ad %>% 
		unnest(cols=c(t_c)) %>%  
		rename(ad_time_utc=event_time_utc) %>% 
		as.data.table %>%
		.[group=="T", .(device_id, ad_time_utc)] %>%
		unique %>%
		setkey(device_id)

	expand_view <- news_view[dev_stack, nomatch=0, allow.cartesian = T]

	expand_view[,.(news_time = get_prev_24h_time(event_time_utc, event_time_utc_end, ad_time_utc)),by=.(device_id, ad_time_utc)] %>%
		.[dev_stack, on = .(device_id, ad_time_utc)] %>%
		.[is.na(news_time),news_time:=0] %>%
		.[,date:=d] 
}


seq(mdy("09/01/2012"), mdy("11/06/2012"), by = "days") %>%
	map(get_mins) %>%
	rbindlist -> 
baseline_time

baseline_time_byad <- baseline_time[,.(avg_news_time = mean(news_time, na.rm=T), n_devs = .N), by = .(date, ad_time_utc)]
baseline_time_bydev <- baseline_time[,.(avg_news_time = mean(news_time, na.rm=T), n_ads = .N), by = .(device_id)]

baseline_time_bydev %>% saveRDS(paste0(data_dir, "baseline_time_by_device.rds"))


quantile(baseline_time_byad$avg_news_time, probs=c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(baseline_time_byad$avg_news_time)

quantile(baseline_time_bydev$avg_news_time, probs=c(0.1, 0.25, 0.5, 0.75, 0.9))
mean(baseline_time_bydev$avg_news_time)


daily_baseline <- baseline_time[,.(mean_time = mean(news_time), med_time = median(news_time), n_dev = .N), by =.(date)]

daily_baseline %>% saveRDS("baseline_time_daily.rds")

